{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 隐语义模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入包\n",
    "import random\n",
    "import math\n",
    "import numpy as np\n",
    "import time\n",
    "from tqdm import tqdm, trange"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 一. 通用函数定义"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义装饰器，监控运行时间\n",
    "def timmer(func):\n",
    "    def wrapper(*args, **kwargs):\n",
    "        start_time = time.time()\n",
    "        res = func(*args, **kwargs)\n",
    "        stop_time = time.time()\n",
    "        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))\n",
    "        return res\n",
    "    return wrapper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 数据处理相关\n",
    "1. load data\n",
    "2. split data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Dataset():\n",
    "    \n",
    "    def __init__(self, fp):\n",
    "        # fp: data file path\n",
    "        self.data = self.loadData(fp)\n",
    "    \n",
    "    @timmer\n",
    "    def loadData(self, fp):\n",
    "        data = []\n",
    "        for l in open(fp):\n",
    "            data.append(tuple(map(int, l.strip().split('::')[:2])))\n",
    "        return data\n",
    "    \n",
    "    @timmer\n",
    "    def splitData(self, M, k, seed=1):\n",
    "        '''\n",
    "        :params: data, 加载的所有(user, item)数据条目\n",
    "        :params: M, 划分的数目，最后需要取M折的平均\n",
    "        :params: k, 本次是第几次划分，k~[0, M)\n",
    "        :params: seed, random的种子数，对于不同的k应设置成一样的\n",
    "        :return: train, test\n",
    "        '''\n",
    "        train, test = [], []\n",
    "        random.seed(seed)\n",
    "        for user, item in self.data:\n",
    "            # 这里与书中的不一致，本人认为取M-1较为合理，因randint是左右都覆盖的\n",
    "            if random.randint(0, M-1) == k:  \n",
    "                test.append((user, item))\n",
    "            else:\n",
    "                train.append((user, item))\n",
    "\n",
    "        # 处理成字典的形式，user->set(items)\n",
    "        def convert_dict(data):\n",
    "            data_dict = {}\n",
    "            for user, item in data:\n",
    "                if user not in data_dict:\n",
    "                    data_dict[user] = set()\n",
    "                data_dict[user].add(item)\n",
    "            data_dict = {k: list(data_dict[k]) for k in data_dict}\n",
    "            return data_dict\n",
    "\n",
    "        return convert_dict(train), convert_dict(test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 评价指标\n",
    "1. Precision\n",
    "2. Recall\n",
    "3. Coverage\n",
    "4. Popularity(Novelty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Metric():\n",
    "    \n",
    "    def __init__(self, train, test, GetRecommendation):\n",
    "        '''\n",
    "        :params: train, 训练数据\n",
    "        :params: test, 测试数据\n",
    "        :params: GetRecommendation, 为某个用户获取推荐物品的接口函数\n",
    "        '''\n",
    "        self.train = train\n",
    "        self.test = test\n",
    "        self.GetRecommendation = GetRecommendation\n",
    "        self.recs = self.getRec()\n",
    "        \n",
    "    # 为test中的每个用户进行推荐\n",
    "    def getRec(self):\n",
    "        recs = {}\n",
    "        for user in self.test:\n",
    "            rank = self.GetRecommendation(user)\n",
    "            recs[user] = rank\n",
    "        return recs\n",
    "        \n",
    "    # 定义精确率指标计算方式\n",
    "    def precision(self):\n",
    "        all, hit = 0, 0\n",
    "        for user in self.test:\n",
    "            test_items = set(self.test[user])\n",
    "            rank = self.recs[user]\n",
    "            for item, score in rank:\n",
    "                if item in test_items:\n",
    "                    hit += 1\n",
    "            all += len(rank)\n",
    "        return round(hit / all * 100, 2)\n",
    "    \n",
    "    # 定义召回率指标计算方式\n",
    "    def recall(self):\n",
    "        all, hit = 0, 0\n",
    "        for user in self.test:\n",
    "            test_items = set(self.test[user])\n",
    "            rank = self.recs[user]\n",
    "            for item, score in rank:\n",
    "                if item in test_items:\n",
    "                    hit += 1\n",
    "            all += len(test_items)\n",
    "        return round(hit / all * 100, 2)\n",
    "    \n",
    "    # 定义覆盖率指标计算方式\n",
    "    def coverage(self):\n",
    "        all_item, recom_item = set(), set()\n",
    "        for user in self.test:\n",
    "            for item in self.train[user]:\n",
    "                all_item.add(item)\n",
    "            rank = self.recs[user]\n",
    "            for item, score in rank:\n",
    "                recom_item.add(item)\n",
    "        return round(len(recom_item) / len(all_item) * 100, 2)\n",
    "    \n",
    "    # 定义新颖度指标计算方式\n",
    "    def popularity(self):\n",
    "        # 计算物品的流行度\n",
    "        item_pop = {}\n",
    "        for user in self.train:\n",
    "            for item in self.train[user]:\n",
    "                if item not in item_pop:\n",
    "                    item_pop[item] = 0\n",
    "                item_pop[item] += 1\n",
    "\n",
    "        num, pop = 0, 0\n",
    "        for user in self.test:\n",
    "            rank = self.recs[user]\n",
    "            for item, score in rank:\n",
    "                # 取对数，防止因长尾问题带来的被流行物品所主导\n",
    "                pop += math.log(1 + item_pop[item])\n",
    "                num += 1\n",
    "        return round(pop / num, 6)\n",
    "    \n",
    "    def eval(self):\n",
    "        metric = {'Precision': self.precision(),\n",
    "                  'Recall': self.recall(),\n",
    "                  'Coverage': self.coverage(),\n",
    "                  'Popularity': self.popularity()}\n",
    "        print('Metric:', metric)\n",
    "        return metric"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 二. LFM算法实现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def LFM(train, ratio, K, lr, step, lmbda, N):\n",
    "    '''\n",
    "    :params: train, 训练数据\n",
    "    :params: ratio, 负采样的正负比例\n",
    "    :params: K, 隐语义个数\n",
    "    :params: lr, 初始学习率\n",
    "    :params: step, 迭代次数\n",
    "    :params: lmbda, 正则化系数\n",
    "    :params: N, 推荐TopN物品的个数\n",
    "    :return: GetRecommendation, 获取推荐结果的接口\n",
    "    '''\n",
    "    \n",
    "    all_items = {}\n",
    "    for user in train:\n",
    "        for item in train[user]:\n",
    "            if item not in all_items:\n",
    "                all_items[item] = 0\n",
    "            all_items[item] += 1\n",
    "    all_items = list(all_items.items())\n",
    "    items = [x[0] for x in all_items]\n",
    "    pops = [x[1] for x in all_items]\n",
    "    \n",
    "    # 负采样函数(注意！！！要按照流行度进行采样)\n",
    "    def nSample(data, ratio):\n",
    "        new_data = {}\n",
    "        # 正样本\n",
    "        for user in data:\n",
    "            if user not in new_data:\n",
    "                new_data[user] = {}\n",
    "            for item in data[user]:\n",
    "                new_data[user][item] = 1\n",
    "        # 负样本\n",
    "        for user in new_data:\n",
    "            seen = set(new_data[user])\n",
    "            pos_num = len(seen)\n",
    "            item = np.random.choice(items, int(pos_num * ratio * 3), pops)\n",
    "            item = [x for x in item if x not in seen][:int(pos_num * ratio)]\n",
    "            new_data[user].update({x: 0 for x in item})\n",
    "        \n",
    "        return new_data\n",
    "                \n",
    "    # 训练\n",
    "    P, Q = {}, {}\n",
    "    for user in train:\n",
    "        P[user] = np.random.random(K)\n",
    "    for item in items:\n",
    "        Q[item] = np.random.random(K)\n",
    "            \n",
    "    for s in trange(step):\n",
    "        data = nSample(train, ratio)\n",
    "        for user in data:\n",
    "            for item in data[user]:\n",
    "                eui = data[user][item] - (P[user] * Q[item]).sum()\n",
    "                P[user] += lr * (Q[item] * eui - lmbda * P[user])\n",
    "                Q[item] += lr * (P[user] * eui - lmbda * Q[item])\n",
    "        lr *= 0.9 # 调整学习率\n",
    "        \n",
    "    # 获取接口函数\n",
    "    def GetRecommendation(user):\n",
    "        seen_items = set(train[user])\n",
    "        recs = {}\n",
    "        for item in items:\n",
    "            if item not in seen_items:\n",
    "                recs[item] = (P[user] * Q[item]).sum()\n",
    "        recs = list(sorted(recs.items(), key=lambda x: x[1], reverse=True))[:N]\n",
    "        return recs\n",
    "    \n",
    "    return GetRecommendation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 三. LFM实验\n",
    "M=8, N=10, ratio=[1, 2, 3, 5, 10, 20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Experiment():\n",
    "    \n",
    "    def __init__(self, M, N, ratio=1,\n",
    "                 K=100, lr=0.02, step=100, lmbda=0.01, fp='../dataset/ml-1m/ratings.dat'):\n",
    "        '''\n",
    "        :params: M, 进行多少次实验\n",
    "        :params: N, TopN推荐物品的个数\n",
    "        :params: ratio, 正负样本比例\n",
    "        :params: K, 隐语义个数\n",
    "        :params: lr, 学习率\n",
    "        :params: step, 训练步数\n",
    "        :params: lmbda, 正则化系数\n",
    "        :params: fp, 数据文件路径\n",
    "        '''\n",
    "        self.M = M\n",
    "        self.K = K\n",
    "        self.N = N\n",
    "        self.ratio = ratio\n",
    "        self.lr = lr\n",
    "        self.step = step\n",
    "        self.lmbda = lmbda\n",
    "        self.fp = fp\n",
    "        self.alg = LFM\n",
    "    \n",
    "    # 定义单次实验\n",
    "    @timmer\n",
    "    def worker(self, train, test):\n",
    "        '''\n",
    "        :params: train, 训练数据集\n",
    "        :params: test, 测试数据集\n",
    "        :return: 各指标的值\n",
    "        '''\n",
    "        getRecommendation = self.alg(train, self.ratio, self.K, \n",
    "                                     self.lr, self.step, self.lmbda, self.N)\n",
    "        metric = Metric(train, test, getRecommendation)\n",
    "        return metric.eval()\n",
    "    \n",
    "    # 多次实验取平均\n",
    "    @timmer\n",
    "    def run(self):\n",
    "        metrics = {'Precision': 0, 'Recall': 0, \n",
    "                   'Coverage': 0, 'Popularity': 0}\n",
    "        dataset = Dataset(self.fp)\n",
    "        for ii in range(self.M):\n",
    "            train, test = dataset.splitData(self.M, ii)\n",
    "            print('Experiment {}:'.format(ii))\n",
    "            metric = self.worker(train, test)\n",
    "            metrics = {k: metrics[k]+metric[k] for k in metrics}\n",
    "        metrics = {k: metrics[k] / self.M for k in metrics}\n",
    "        print('Average Result (M={}, N={}, ratio={}): {}'.format(\\\n",
    "                              self.M, self.N, self.ratio, metrics))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# LFM实验(运行时间较长，这里没贴实验结果)\n",
    "M, N = 8, 10\n",
    "for r in [1, 2, 3, 5, 10, 20]:\n",
    "    exp = Experiment(M, N, ratio=r)\n",
    "    exp.run()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 四. 总结\n",
    "1. 负采样的时候要按照流行度进行采样。按照书中的说法：一般认为，很热门而用户却没有行为更加代表用户对这个物品不感兴趣，因为对于冷门的物品，用户可能是压根没有在网站中发现这个物品，所以谈不上是否感兴趣。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
